In [2]:
    
%matplotlib inline
import matplotlib.pyplot as plt
    
In [3]:
    
dtt = []
with open('/Users/chengjun/github/cjc2016/data/tianya_bbs_threads_network.txt', 'r') as f:
    for line in f:
        pnum, link, time, author_id, author, content = line.replace('\n', '').split('\t')
        dtt.append([pnum, link, time, author_id, author, content])
len(dtt)
    
    Out[3]:
In [4]:
    
import pandas as pd
dt = pd.DataFrame(dtt)
dt=dt.rename(columns = {0:'page_num', 1:'link', 2:'time', 3:'author',4:'author_name', 5:'reply'})
dt[:5]
    
    Out[4]:
In [5]:
    
# extract date from datetime
date = map(lambda x: x[:10], dt.time)
dt['date'] = pd.to_datetime(date)
    
In [19]:
    
dt[:5]
    
    Out[19]:
In [7]:
    
import pandas as pd
df = pd.read_csv('/Users/chengjun/github/cjc2016/data/tianya_bbs_threads_list.txt', sep = "\t", header=None)
df=df.rename(columns = {0:'title', 1:'link', 2:'author',3:'author_page', 4:'click', 5:'reply', 6:'time'})
df[:2]
    
    Out[7]:
In [8]:
    
from collections import defaultdict
link_user_dict = defaultdict(list)
for i in range(len(dt)):
    link_user_dict[dt.link[i]].append(dt.author[i])
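link_user_dict maps each thread link to the list of author ids appearing in its replies, so the counts derived from it below are numbers of posts rather than distinct participants (duplicate authors are kept). If distinct participants are wanted instead, a set can be used — a minimal sketch, assuming link_user_dict from the cell above:
In [ ]:
    
# distinct participants per thread (a sketch; the analysis below keeps duplicates)
unique_user_dict = {link: len(set(users)) for link, users in link_user_dict.items()}
sorted(unique_user_dict.values(), reverse=True)[:5]
    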
    
In [12]:
    
df['user'] = [len(link_user_dict[l]) for l in df.link]
df[:2]
    
    Out[12]:
In [18]:
    
import statsmodels.api as sm
import numpy as np
x = np.log(df.user+1)
y = np.log(df.reply+1)
xx = sm.add_constant(x, prepend=True)
res = sm.OLS(y,xx).fit()
constant,beta = res.params
r2 = res.rsquared
fig = plt.figure(figsize=(8, 4),facecolor='white')
plt.plot(df.user, df.reply, 'rs', label= 'Data')
plt.plot(np.exp(x), np.exp(constant + x*beta),"-", label = 'Fit')
plt.yscale('log');plt.xscale('log')
plt.xlabel(r'$Users$', fontsize = 20)
plt.ylabel(r'$Replies$', fontsize = 20)
plt.text(max(df.user)/300,max(df.reply)/20,
         r'$\beta$ = ' + str(round(beta,2)) +'\n' + r'$R^2$ = ' + str(round(r2, 2)))
plt.legend(loc=2,fontsize=10, numpoints=1)
plt.axis('tight')
plt.show()
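
The regression above is run in log-log space, log(reply + 1) = constant + beta * log(user + 1), which corresponds to the power law reply + 1 ≈ exp(constant) * (user + 1)^beta. A minimal check of the fit, assuming res, constant, and beta from the cell above:
In [ ]:
    
# full regression table for the log-log fit
print res.summary()
# translate the fit back to the raw scale: predicted replies for a
# hypothetical thread with 100 participating users
print np.exp(constant + beta*np.log(100 + 1)) - 1
    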
    
    
In [14]:
    
x = np.log(df.user+1)
y = np.log(df.click+1)
xx = sm.add_constant(x, prepend=True)
res = sm.OLS(y,xx).fit()
constant,beta = res.params
r2 = res.rsquared
fig = plt.figure(figsize=(8, 4),facecolor='white')
plt.plot(df.user, df.click, 'rs', label= 'Data')
plt.plot(np.exp(x), np.exp(constant + x*beta),"-", label = 'Fit')
plt.yscale('log');plt.xscale('log')
plt.xlabel(r'$Users$', fontsize = 20)
plt.ylabel(r'$Clicks$', fontsize = 20)
plt.text(max(df.user)/300,max(df.click)/20,
         r'$\beta$ = ' + str(round(beta,2)) +'\n' + r'$R^2$ = ' + str(round(r2, 2)))
plt.legend(loc=2,fontsize=10, numpoints=1)
plt.axis('tight')
plt.show()
    
    
In [9]:
    
# convert str to datetime format
dt.time = pd.to_datetime(dt.time)
dt['month'] = dt.time.dt.month
dt['year'] = dt.time.dt.year
dt['day'] = dt.time.dt.day
type(dt.time[0])
    
    Out[9]:
In [80]:
    
d = dt.year.value_counts()
dd = pd.DataFrame(d)
dd = dd.sort_index(axis=0, ascending=True)
ds = dd.cumsum()
    
In [81]:
    
def getDate(dat):
    dat_date_str = map(lambda x: str(x) +'-01-01', dat.index)
    dat_date = pd.to_datetime(dat_date_str)
    return dat_date
ds['date'] = getDate(ds)  # assign as a real DataFrame column (attribute assignment would not create one)
dd['date'] = getDate(dd)
    
In [82]:
    
fig = plt.figure(figsize=(12,5))
plt.plot(ds.date, ds.year, 'g-s', label = '$Cumulative\: Number\:of\: Threads$')
plt.plot(dd.date, dd.year, 'r-o', label = '$Yearly\:Number\:of\:Threads$')
#plt.yscale('log')
plt.legend(loc=2,numpoints=1,fontsize=13)
plt.show()
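
The same aggregation can be done at monthly resolution using the year and month columns created earlier — a minimal sketch with the column names defined above:
In [ ]:
    
# number of posts per (year, month); groupby().size() returns a Series
# indexed by (year, month) pairs
monthly = dt.groupby(['year', 'month']).size()
monthly[:12]
    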
    
    
In [98]:
    
dt.reply[:55]
    
    Out[98]:
(sample reply texts: many @-mention other users, e.g. "@贾也 2012-10-29 07:59:00 导语: ...", or quote earlier posts, e.g. "回复第20楼, @rual_f ..." — the @name pattern is what the regexes below extract)
In [2]:
    
import re
tweet = u"//@lilei: dd //@Bob: cc//@Girl: dd//@魏武: \
    利益所致 自然念念不忘//@诺什: 吸引优质  客户,摆脱屌丝男!!!//@MarkGreene: 转发微博"
RTpattern = r'''//?@(\w+)'''  # matches "/@name" or "//@name"; the group captures the user name
for word in re.findall(RTpattern, tweet, re.UNICODE):
    print word
    
    
In [6]:
    
RTpattern = r'''@(\w+)\s'''  # "@name" must be followed by whitespace; names followed directly by punctuation or at the end of the text are not matched
tweet = u"@lilei: dd @Bob: cc @Girl: dd @魏武: \
    利益所致 自然念念不忘 //@诺什: 吸引优质  客户,摆脱屌丝男!!!"
for word in re.findall(RTpattern, tweet, re.UNICODE):
    print word
    
In [154]:
    
if re.findall(RTpattern, dt.reply[0].decode('utf8'), re.UNICODE):
    print True
else:
    print False
    
    
In [121]:
    
for k, tweet in enumerate(dt.reply[:100]):
    tweet = tweet.decode('utf8')
    RTpattern = r'''@(\w+)\s'''
    for person in re.findall(RTpattern, tweet, re.UNICODE):
        print k,'\t',dt.author_name[k],'\t', person,'\t\t', tweet[:30]
    
    
In [109]:
    
print dt.reply[80]
    
    
In [158]:
    
link_author_dict = {}
for i in range(len(df)):
    link_author_dict[df.link[i]] =df.author[i]
    
In [176]:
    
graph = []
for k, tweet in enumerate(dt.reply):
    tweet = tweet.decode('utf8')
    url = dt.link[k]
    RTpattern = r'''@(\w+)\s'''
    persons = re.findall(RTpattern, tweet, re.UNICODE)
    if persons:
        for person in persons:
            graph.append([dt.author_name[k].decode('utf8'), person])
    else:
        graph.append( [dt.author_name[k].decode('utf8'), link_author_dict[url].decode('utf8')]  )
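Each element of graph is a directed (replier, replied-to) pair: an explicit @-mention when the regex finds one, otherwise the thread's original author. A quick look at the most frequent senders — a sketch, assuming graph from the cell above:
In [ ]:
    
from collections import Counter
# how often each user appears as the source (replier) of an edge
sender_counts = Counter(x for x, y in graph)
for name, n in sender_counts.most_common(5):
    print name, n
    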
    
In [177]:
    
len(graph)
    
    Out[177]:
In [178]:
    
for x, y in graph[:3]:
    print x, y
    
    
In [179]:
    
import networkx as nx
    
In [180]:
    
G = nx.DiGraph()
for x,y in graph:
    if x != y:
        G.add_edge(x,y)
    
In [181]:
    
nx.info(G)
    
    Out[181]:
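Edges point from the replier to the user being addressed, so in-degree measures how many distinct users reply to or @-mention a given author. A minimal sketch of the most-addressed users, assuming G from the cells above (dict() keeps it working across networkx versions):
In [ ]:
    
# in-degree: number of incoming reply/@-mention edges per author
indegree = dict(G.in_degree())
for name, k in sorted(indegree.items(), key=lambda x: x[1], reverse=True)[:5]:
    print name, k
    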
In [182]:
    
GU=G.to_undirected(reciprocal=True)
graphs = list(nx.connected_component_subgraphs(GU))
    
In [185]:
    
import numpy as np
size = []
for i in graphs:
    size.append(len(i.nodes()))
len(size), np.max(size)
    
    Out[185]:
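Component sizes in reply networks are typically heavy-tailed: one large component plus many small ones. A minimal sketch of the size distribution on log-log axes, assuming size from the cell above:
In [ ]:
    
from collections import Counter
size_counts = Counter(size)
# each point: a component size vs. how many components have that size
plt.plot(list(size_counts.keys()), list(size_counts.values()), 'ro')
plt.xscale('log'); plt.yscale('log')
plt.xlabel('component size'); plt.ylabel('number of components')
plt.show()
    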
In [190]:
    
gs = []
for i in graphs:
    if len(i.nodes()) >5:
        gs.append(i)
len(gs)
    
    Out[190]:
In [191]:
    
for g in gs:
    print len(g.nodes())
    
    
In [192]:
    
g_max = max(gs, key=lambda g: len(g.nodes()))  # the largest connected component
len(g_max.nodes())
    
    Out[192]:
In [198]:
    
pos = nx.spring_layout(g_max)
# compute a spring (force-directed) layout; other layout algorithms will be introduced
# later, and it is worth comparing how they change the picture
nx.draw(g_max, pos, with_labels=False, node_size = 30)
# draw the graph: with_labels toggles node labels, node_size sets the node marker size
plt.show()  # display the figure
    
    
In [203]:
    
# export the edge list of the largest component as a two-column CSV for Gephi
# (note: mode 'a' appends, so re-running this cell will duplicate edges)
with open('/Users/chengjun/github/cjc2016/data/tianya_network_120.csv', 'a') as f:
    for x, y in g_max.edges():
        f.write(x.encode('utf8') + ',' + y.encode('utf8') + '\n')
    
Gephi
In [ ]: